load(file = "gsea_output.RData")
# Add variable containing number of genes in leading edge subset
score_df$nLeadingEdge <- as.numeric(lapply(score_df$leadingEdge, length))
Load .RData file from gsea_source.R analysis.
# Enrichment score distribution
hist(score_df$ES, breaks = 50)
# Normalized Enrichment score distribution
hist(score_df$NES, breaks = 50)
# Nominal P Value distribution
hist(score_df$pval, breaks = 50)
# FDR q value distribution
hist(score_df$padj, breaks = 50)
# Number of more extreme gene sets distribution
hist(score_df$nMoreExtreme, breaks = 50)
# Size of gene set distribution
hist(score_df$size, breaks = 50)
# Size of leading edge subset distribution
hist(score_df$nLeadingEdge, breaks = 50)
We created histograms of each of the numeric variables from the GSEA output. This demonstrated the bimodal distributions for ES and NES as well as the changes in distributions between the nominal p-values and fdr values.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Filter data set using fdr cutoff
cutoff <- 1
filter_scores <- score_df %>%
filter(padj <= cutoff)
summary(filter_scores[,c(2:7,9)])
## pval padj ES NES
## Min. :0.000999 Min. :0.01413 Min. :-0.9699 Min. :-2.3518
## 1st Qu.:0.056920 1st Qu.:0.22727 1st Qu.:-0.3132 1st Qu.:-0.9509
## Median :0.335093 Median :0.66828 Median : 0.2795 Median : 0.7641
## Mean :0.388967 Mean :0.57319 Mean : 0.1297 Mean : 0.3322
## 3rd Qu.:0.688758 3rd Qu.:0.91816 3rd Qu.: 0.4920 3rd Qu.: 1.3075
## Max. :1.000000 Max. :1.00000 Max. : 0.9620 Max. : 2.8764
## nMoreExtreme size nLeadingEdge
## Min. : 0.0 Min. : 5.00 Min. : 1.00
## 1st Qu.: 23.0 1st Qu.: 10.00 1st Qu.: 4.00
## Median :147.0 Median : 19.00 Median : 7.00
## Mean :202.8 Mean : 45.01 Mean : 13.93
## 3rd Qu.:324.2 3rd Qu.: 41.00 3rd Qu.: 13.00
## Max. :919.0 Max. :1639.00 Max. :362.00
# Size of gene set vs size of leading edge subset
size_map <- ggplot(score_df,
mapping = aes(size, nLeadingEdge, alpha = 0.01, label = pathway)) +
geom_point() +
ggtitle("Gene set size vs. size of leading edge subset")
plotly::ggplotly(size_map)
# Normalized vs Observed Enrichment Score
ggplot(score_df) +
geom_histogram(aes(ES, alpha = 0.05, fill = "ES"),
bins = 50) +
geom_histogram(aes(NES, alpha = 0.05, fill = "NES"),
bins = 50) +
ggtitle("Distribution of ES(S) and NES(S) values") +
scale_fill_manual(name = element_blank(),
values = c("ES" = "red", "NES" = "blue"),
labels = c("ES(S)", "NES(S)")) +
theme(axis.title.x = element_blank())
We see a fairly strong positive correlation between gene set size and the size of the leading edge subset for that gene set, as was expected. The majority of observations have a gene set size under 100 and leading edge subset under 50.
The second plot shows the ES histogram overlayed with the NES histogram. Both, as noted previously, exhibit a bimodal distribution, but we see that the NES distribution has a noticeably wider spread to the data, making extreme values much more apparent.
# Plot fdr and gene set size together
fdr_size_plot <-
ggplot(
data = score_df,
mapping = aes(padj, size, alpha = 0.05)
) +
geom_point(colour = "darkblue") +
ggtitle("False Discovery Rate and Gene Set Size")
plotly::ggplotly(fdr_size_plot)
# FDR distribution for gene sets over 100
large_sets <- score_df %>%
filter(size >= 100)
hist(large_sets$padj)
plot(large_sets$padj, large_sets$size)
cor.test(large_sets$padj, large_sets$size)
##
## Pearson's product-moment correlation
##
## data: large_sets$padj and large_sets$size
## t = 0.037464, df = 141, p-value = 0.9702
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.161077 0.167217
## sample estimates:
## cor
## 0.003155039
There does not seem to be any significant relationship between a gene set’s size and its false discovery rate. Filtering to include only the gene sets with size over 100, it seems that there are significantly more observations with fdr values close to zero; however, the two variables are not linearly correlated even after filtering the data.
# Create enrichment plot for selected gene set/pathway
set <- "REACTOME_RHOA_GTPASE_CYCLE"
enrichment_plot <- fgsea::plotEnrichment(
pathway = pathways[[set]],
stats = mageck_lfc_sort,
gseaParam = 1
)
enrichment_plot
enrichment_plot2 <- fgsea::plotEnrichment(
pathway = pathways[["WP_ENDOCHONDRAL_OSSIFICATION"]],
stats = mageck_lfc_sort,
gseaParam = 1
)
enrichment_plot2
enrichment_plot3 <- fgsea::plotEnrichment(
pathway = pathways[["REACTOME_HOMOLOGY_DIRECTED_REPAIR"]],
stats = mageck_lfc_sort,
gseaParam = 1
)
enrichment_plot3
enrichment_plot4 <- fgsea::plotEnrichment(
pathway = pathways[["REACTOME_G0_AND_EARLY_G1"]],
stats = mageck_lfc_sort,
gseaParam = 1
)
enrichment_plot4
# View pathways with lowest fdr values
gsea_by_fdr <- score_df[order(score_df$padj, decreasing = FALSE),]
head(gsea_by_fdr)
## pathway
## 1: BIOCARTA_MCM_PATHWAY
## 2: BIOCARTA_ATRBRCA_PATHWAY
## 3: REACTOME_TRANSLESION_SYNTHESIS_BY_Y_FAMILY_DNA_POLYMERASES_BYPASSES_LESIONS_ON_DNA_TEMPLATE
## 4: REACTOME_RECOGNITION_OF_DNA_DAMAGE_BY_PCNA_CONTAINING_REPLICATION_COMPLEX
## 5: REACTOME_TRANSLESION_SYNTHESIS_BY_POLH
## 6: REACTOME_RESOLUTION_OF_AP_SITES_VIA_THE_MULTIPLE_NUCLEOTIDE_PATCH_REPLACEMENT_PATHWAY
## pval padj ES NES nMoreExtreme size
## 1: 0.001543210 0.01413043 0.8742624 2.084541 0 13
## 2: 0.001490313 0.01413043 0.7606050 1.992289 0 19
## 3: 0.001388889 0.01413043 0.7902764 2.312725 0 33
## 4: 0.001416431 0.01413043 0.8090642 2.248785 0 26
## 5: 0.001562500 0.01413043 0.8793194 2.199676 0 16
## 6: 0.001466276 0.01413043 0.8198164 2.175772 0 21
## leadingEdge nLeadingEdge
## 1: orc6,orc5,orc2,orc4,mcm7,mcm4,... 11
## 2: chek1,rad9a,rad51,mre11a,rad1,rad50,... 11
## 3: pole,rpa3,rfc5,vcp,pcna,pold2,... 20
## 4: pole,rpa3,rfc5,pcna,wdr48,pold2,... 20
## 5: rpa3,rfc5,vcp,pcna,rfc2,rps27a,... 12
## 6: pole,rpa3,rfc5,pcna,pold2,rfc2,... 18
# View extreme pathways by ES and NES
gsea_by_ES <- score_df[order(score_df$ES, decreasing = TRUE),]
head(gsea_by_ES[,c("pathway", "ES", "NES", "padj")])
## pathway
## 1: REACTOME_CDC6_ASSOCIATION_WITH_THE_ORC_ORIGIN_COMPLEX
## 2: REACTOME_ASSEMBLY_OF_THE_ORC_COMPLEX_AT_THE_ORIGIN_OF_REPLICATION
## 3: REACTOME_PHOSPHORYLATION_OF_EMI1
## 4: REACTOME_SLBP_DEPENDENT_PROCESSING_OF_REPLICATION_DEPENDENT_HISTONE_PRE_MRNAS
## 5: REACTOME_MISMATCH_REPAIR_MMR_DIRECTED_BY_MSH2_MSH3_MUTSBETA
## 6: REACTOME_DNA_STRAND_ELONGATION
## ES NES padj
## 1: 0.9619646 2.026453 0.01484389
## 2: 0.9539566 2.069101 0.01462986
## 3: 0.9291420 1.798997 0.02738644
## 4: 0.9267651 2.060102 0.01446076
## 5: 0.9169753 1.931680 0.01484389
## 6: 0.9045440 2.400637 0.01413043
tail(gsea_by_ES[,c("pathway", "ES", "NES", "padj")])
## pathway
## 1: REACTOME_ASSOCIATION_OF_TRIC_CCT_WITH_TARGET_PROTEINS_DURING_BIOSYNTHESIS
## 2: BIOCARTA_IL22BP_PATHWAY
## 3: BIOCARTA_IFNG_PATHWAY
## 4: BIOCARTA_STAT3_PATHWAY
## 5: BIOCARTA_TERT_PATHWAY
## 6: BIOCARTA_IFNA_PATHWAY
## ES NES padj
## 1: -0.7548374 -1.861285 0.02067350
## 2: -0.7684930 -1.894957 0.02067350
## 3: -0.7987253 -1.726270 0.10005863
## 4: -0.8271995 -2.005774 0.02034230
## 5: -0.8367602 -1.934245 0.02019148
## 6: -0.9699129 -2.351821 0.02034230
gsea_by_NES <- score_df[order(score_df$NES, decreasing = TRUE),]
head(gsea_by_NES[,c("pathway", "ES", "NES", "padj")])
## pathway
## 1: REACTOME_METABOLISM_OF_RNA
## 2: REACTOME_MAJOR_PATHWAY_OF_RRNA_PROCESSING_IN_THE_NUCLEOLUS_AND_CYTOSOL
## 3: REACTOME_HOMOLOGY_DIRECTED_REPAIR
## 4: REACTOME_NONSENSE_MEDIATED_DECAY_NMD
## 5: REACTOME_MRNA_SPLICING
## 6: REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA
## ES NES padj
## 1: 0.7028721 2.876448 0.01413043
## 2: 0.7638078 2.857947 0.01413043
## 3: 0.7856302 2.787720 0.01413043
## 4: 0.7736164 2.770933 0.01413043
## 5: 0.7307581 2.766267 0.01413043
## 6: 0.7087467 2.761222 0.01413043
tail(gsea_by_NES[,c("pathway", "ES", "NES", "padj")])
## pathway ES
## 1: BIOCARTA_TERT_PATHWAY -0.8367602
## 2: WP_DYSREGULATED_MIRNA_TARGETING_IN_INSULINPI3KAKT_SIGNALING -0.5890128
## 3: WP_ENDOCHONDRAL_OSSIFICATION -0.5013328
## 4: BIOCARTA_STAT3_PATHWAY -0.8271995
## 5: REACTOME_CD28_DEPENDENT_PI3K_AKT_SIGNALING -0.7274447
## 6: BIOCARTA_IFNA_PATHWAY -0.9699129
## NES padj
## 1: -1.934245 0.02019148
## 2: -1.970958 0.02602603
## 3: -1.978433 0.03381908
## 4: -2.005774 0.02034230
## 5: -2.321345 0.02429907
## 6: -2.351821 0.02034230
#
# # Look-up by gene code
# gene <- "musk"
# if (length(mageck[mageck$id ==gene,]) != 0){
# print(mageck[mageck$id==gene,])
# } else {
# print("No results found")
# }